{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('./data/train_set.csv')\n",
    "test = pd.read_csv('./data/test_set.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>article</th>\n",
       "      <th>word_seg</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>7368 1252069 365865 755561 1044285 129532 1053...</td>\n",
       "      <td>816903 597526 520477 1179558 1033823 758724 63...</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>581131 165432 7368 957317 1197553 570900 33659...</td>\n",
       "      <td>90540 816903 441039 816903 569138 816903 10343...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7368 87936 40494 490286 856005 641588 145611 1...</td>\n",
       "      <td>816903 1012629 957974 1033823 328210 947200 65...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>299237 760651 299237 887082 159592 556634 7489...</td>\n",
       "      <td>563568 1239563 680125 780219 782805 1033823 19...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>7368 7368 7368 865510 7368 396966 995243 37685...</td>\n",
       "      <td>816903 816903 816903 139132 816903 312320 1103...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                            article  \\\n",
       "0   0  7368 1252069 365865 755561 1044285 129532 1053...   \n",
       "1   1  581131 165432 7368 957317 1197553 570900 33659...   \n",
       "2   2  7368 87936 40494 490286 856005 641588 145611 1...   \n",
       "3   3  299237 760651 299237 887082 159592 556634 7489...   \n",
       "4   4  7368 7368 7368 865510 7368 396966 995243 37685...   \n",
       "\n",
       "                                            word_seg  class  \n",
       "0  816903 597526 520477 1179558 1033823 758724 63...     14  \n",
       "1  90540 816903 441039 816903 569138 816903 10343...      3  \n",
       "2  816903 1012629 957974 1033823 328210 947200 65...     12  \n",
       "3  563568 1239563 680125 780219 782805 1033823 19...     13  \n",
       "4  816903 816903 816903 139132 816903 312320 1103...     12  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF构建文本特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vec = TfidfVectorizer(analyzer='word',\n",
    "            ngram_range=(1,2),\n",
    "            min_df=3, \n",
    "            max_df=0.9,\n",
    "            use_idf=True,\n",
    "            smooth_idf=True, \n",
    "            sublinear_tf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_doc = word_vec.fit_transform(train['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_doc = word_vec.transform(test['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<102277x2820641 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 75739932 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_term_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    13\n",
       "1     2\n",
       "2    11\n",
       "3    12\n",
       "4    11\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['label'].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 逻辑LightGBM模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "lb = LabelEncoder()\n",
    "train['label'] = lb.fit_transform(train['class'].tolist())[0,...18]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "#cross_validation\n",
    "#切分训练集 验证集\n",
    "X_train, X_eval, y_train, y_eval  = train_test_split(train_term_doc,train['label'],test_size=0.2,shuffle=True,random_state=2019)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# X_train, X_eval, = train_test_split(train_term_doc,test_size=0.2,shuffle=True,random_state=2019)\n",
    "# y_train, y_eval  = train_test_split(train['label'],test_size=0.2,shuffle=True,random_state=2019)\n",
    "#错误的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<81821x2820641 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 60544982 stored elements in Compressed Sparse Row format>,\n",
       " <20456x2820641 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 15194950 stored elements in Compressed Sparse Row format>)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train,X_eval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((81821,), (20456,))"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train.shape,y_eval.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lightgbm as lgb\n",
    "#sklearn api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\tvalid_0's multi_logloss: 2.59854\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[2]\tvalid_0's multi_logloss: 2.40874\n",
      "[3]\tvalid_0's multi_logloss: 2.262\n",
      "[4]\tvalid_0's multi_logloss: 2.14355\n",
      "[5]\tvalid_0's multi_logloss: 2.04217\n",
      "[6]\tvalid_0's multi_logloss: 1.95521\n",
      "[7]\tvalid_0's multi_logloss: 1.87851\n",
      "[8]\tvalid_0's multi_logloss: 1.80973\n",
      "[9]\tvalid_0's multi_logloss: 1.74865\n",
      "[10]\tvalid_0's multi_logloss: 1.69333\n",
      "[11]\tvalid_0's multi_logloss: 1.64411\n",
      "[12]\tvalid_0's multi_logloss: 1.60052\n",
      "[13]\tvalid_0's multi_logloss: 1.55825\n",
      "[14]\tvalid_0's multi_logloss: 1.51969\n",
      "[15]\tvalid_0's multi_logloss: 1.48463\n",
      "[16]\tvalid_0's multi_logloss: 1.45241\n",
      "[17]\tvalid_0's multi_logloss: 1.42353\n",
      "[18]\tvalid_0's multi_logloss: 1.396\n",
      "[19]\tvalid_0's multi_logloss: 1.37042\n",
      "[20]\tvalid_0's multi_logloss: 1.34695\n",
      "[21]\tvalid_0's multi_logloss: 1.32496\n",
      "[22]\tvalid_0's multi_logloss: 1.30468\n",
      "[23]\tvalid_0's multi_logloss: 1.28517\n",
      "[24]\tvalid_0's multi_logloss: 1.26685\n",
      "[25]\tvalid_0's multi_logloss: 1.24974\n",
      "[26]\tvalid_0's multi_logloss: 1.23317\n",
      "[27]\tvalid_0's multi_logloss: 1.21748\n",
      "[28]\tvalid_0's multi_logloss: 1.20252\n",
      "[29]\tvalid_0's multi_logloss: 1.18815\n",
      "[30]\tvalid_0's multi_logloss: 1.17554\n",
      "[31]\tvalid_0's multi_logloss: 1.16274\n",
      "[32]\tvalid_0's multi_logloss: 1.15046\n",
      "[33]\tvalid_0's multi_logloss: 1.13908\n",
      "[34]\tvalid_0's multi_logloss: 1.12861\n",
      "[35]\tvalid_0's multi_logloss: 1.11829\n",
      "[36]\tvalid_0's multi_logloss: 1.10827\n",
      "[37]\tvalid_0's multi_logloss: 1.09891\n",
      "[38]\tvalid_0's multi_logloss: 1.09011\n",
      "[39]\tvalid_0's multi_logloss: 1.08148\n",
      "[40]\tvalid_0's multi_logloss: 1.07378\n",
      "[41]\tvalid_0's multi_logloss: 1.06543\n",
      "[42]\tvalid_0's multi_logloss: 1.05791\n",
      "[43]\tvalid_0's multi_logloss: 1.05053\n",
      "[44]\tvalid_0's multi_logloss: 1.04329\n",
      "[45]\tvalid_0's multi_logloss: 1.03637\n",
      "[46]\tvalid_0's multi_logloss: 1.02969\n",
      "[47]\tvalid_0's multi_logloss: 1.02331\n",
      "[48]\tvalid_0's multi_logloss: 1.01706\n",
      "[49]\tvalid_0's multi_logloss: 1.01126\n",
      "[50]\tvalid_0's multi_logloss: 1.00585\n",
      "[51]\tvalid_0's multi_logloss: 1.00024\n",
      "[52]\tvalid_0's multi_logloss: 0.995214\n",
      "[53]\tvalid_0's multi_logloss: 0.990295\n",
      "[54]\tvalid_0's multi_logloss: 0.985481\n",
      "[55]\tvalid_0's multi_logloss: 0.98093\n",
      "[56]\tvalid_0's multi_logloss: 0.976229\n",
      "[57]\tvalid_0's multi_logloss: 0.972044\n",
      "[58]\tvalid_0's multi_logloss: 0.967765\n",
      "[59]\tvalid_0's multi_logloss: 0.963654\n",
      "[60]\tvalid_0's multi_logloss: 0.95939\n",
      "[61]\tvalid_0's multi_logloss: 0.955388\n",
      "[62]\tvalid_0's multi_logloss: 0.951457\n",
      "[63]\tvalid_0's multi_logloss: 0.947853\n",
      "[64]\tvalid_0's multi_logloss: 0.944313\n",
      "[65]\tvalid_0's multi_logloss: 0.941154\n",
      "[66]\tvalid_0's multi_logloss: 0.937814\n",
      "[67]\tvalid_0's multi_logloss: 0.934707\n",
      "[68]\tvalid_0's multi_logloss: 0.931652\n",
      "[69]\tvalid_0's multi_logloss: 0.928459\n",
      "[70]\tvalid_0's multi_logloss: 0.925261\n",
      "[71]\tvalid_0's multi_logloss: 0.922455\n",
      "[72]\tvalid_0's multi_logloss: 0.919784\n",
      "[73]\tvalid_0's multi_logloss: 0.917133\n",
      "[74]\tvalid_0's multi_logloss: 0.914452\n",
      "[75]\tvalid_0's multi_logloss: 0.911897\n",
      "[76]\tvalid_0's multi_logloss: 0.909298\n",
      "[77]\tvalid_0's multi_logloss: 0.906855\n",
      "[78]\tvalid_0's multi_logloss: 0.904565\n",
      "[79]\tvalid_0's multi_logloss: 0.902306\n",
      "[80]\tvalid_0's multi_logloss: 0.900313\n",
      "[81]\tvalid_0's multi_logloss: 0.898255\n",
      "[82]\tvalid_0's multi_logloss: 0.896148\n",
      "[83]\tvalid_0's multi_logloss: 0.894135\n",
      "[84]\tvalid_0's multi_logloss: 0.892236\n",
      "[85]\tvalid_0's multi_logloss: 0.890289\n",
      "[86]\tvalid_0's multi_logloss: 0.888209\n",
      "[87]\tvalid_0's multi_logloss: 0.886298\n",
      "[88]\tvalid_0's multi_logloss: 0.884452\n",
      "[89]\tvalid_0's multi_logloss: 0.882723\n",
      "[90]\tvalid_0's multi_logloss: 0.880986\n",
      "[91]\tvalid_0's multi_logloss: 0.879204\n",
      "[92]\tvalid_0's multi_logloss: 0.877774\n",
      "[93]\tvalid_0's multi_logloss: 0.876191\n",
      "[94]\tvalid_0's multi_logloss: 0.87478\n",
      "[95]\tvalid_0's multi_logloss: 0.873144\n",
      "[96]\tvalid_0's multi_logloss: 0.871553\n",
      "[97]\tvalid_0's multi_logloss: 0.870217\n",
      "[98]\tvalid_0's multi_logloss: 0.868815\n",
      "[99]\tvalid_0's multi_logloss: 0.867481\n",
      "[100]\tvalid_0's multi_logloss: 0.865989\n",
      "[101]\tvalid_0's multi_logloss: 0.864489\n",
      "[102]\tvalid_0's multi_logloss: 0.863206\n",
      "[103]\tvalid_0's multi_logloss: 0.861972\n",
      "[104]\tvalid_0's multi_logloss: 0.860909\n",
      "[105]\tvalid_0's multi_logloss: 0.859664\n",
      "[106]\tvalid_0's multi_logloss: 0.858545\n",
      "[107]\tvalid_0's multi_logloss: 0.857412\n",
      "[108]\tvalid_0's multi_logloss: 0.856474\n",
      "[109]\tvalid_0's multi_logloss: 0.855313\n",
      "[110]\tvalid_0's multi_logloss: 0.854288\n",
      "[111]\tvalid_0's multi_logloss: 0.853315\n",
      "[112]\tvalid_0's multi_logloss: 0.852312\n",
      "[113]\tvalid_0's multi_logloss: 0.851366\n",
      "[114]\tvalid_0's multi_logloss: 0.850397\n",
      "[115]\tvalid_0's multi_logloss: 0.849534\n",
      "[116]\tvalid_0's multi_logloss: 0.848468\n",
      "[117]\tvalid_0's multi_logloss: 0.847554\n",
      "[118]\tvalid_0's multi_logloss: 0.846534\n",
      "[119]\tvalid_0's multi_logloss: 0.845688\n",
      "[120]\tvalid_0's multi_logloss: 0.844726\n",
      "[121]\tvalid_0's multi_logloss: 0.843842\n",
      "[122]\tvalid_0's multi_logloss: 0.842973\n",
      "[123]\tvalid_0's multi_logloss: 0.842295\n",
      "[124]\tvalid_0's multi_logloss: 0.841488\n",
      "[125]\tvalid_0's multi_logloss: 0.840788\n",
      "[126]\tvalid_0's multi_logloss: 0.839959\n",
      "[127]\tvalid_0's multi_logloss: 0.839321\n",
      "[128]\tvalid_0's multi_logloss: 0.838762\n",
      "[129]\tvalid_0's multi_logloss: 0.837945\n",
      "[130]\tvalid_0's multi_logloss: 0.837278\n",
      "[131]\tvalid_0's multi_logloss: 0.836712\n",
      "[132]\tvalid_0's multi_logloss: 0.836192\n",
      "[133]\tvalid_0's multi_logloss: 0.835561\n",
      "[134]\tvalid_0's multi_logloss: 0.834976\n",
      "[135]\tvalid_0's multi_logloss: 0.834336\n",
      "[136]\tvalid_0's multi_logloss: 0.833666\n",
      "[137]\tvalid_0's multi_logloss: 0.833063\n",
      "[138]\tvalid_0's multi_logloss: 0.83245\n",
      "[139]\tvalid_0's multi_logloss: 0.831826\n",
      "[140]\tvalid_0's multi_logloss: 0.831289\n",
      "[141]\tvalid_0's multi_logloss: 0.830663\n",
      "[142]\tvalid_0's multi_logloss: 0.830231\n",
      "[143]\tvalid_0's multi_logloss: 0.829898\n",
      "[144]\tvalid_0's multi_logloss: 0.829278\n",
      "[145]\tvalid_0's multi_logloss: 0.828787\n",
      "[146]\tvalid_0's multi_logloss: 0.82827\n",
      "[147]\tvalid_0's multi_logloss: 0.827765\n",
      "[148]\tvalid_0's multi_logloss: 0.82727\n",
      "[149]\tvalid_0's multi_logloss: 0.826817\n",
      "[150]\tvalid_0's multi_logloss: 0.826299\n",
      "[151]\tvalid_0's multi_logloss: 0.825765\n",
      "[152]\tvalid_0's multi_logloss: 0.825342\n",
      "[153]\tvalid_0's multi_logloss: 0.824991\n",
      "[154]\tvalid_0's multi_logloss: 0.8246\n",
      "[155]\tvalid_0's multi_logloss: 0.824239\n",
      "[156]\tvalid_0's multi_logloss: 0.823735\n",
      "[157]\tvalid_0's multi_logloss: 0.823234\n",
      "[158]\tvalid_0's multi_logloss: 0.822774\n",
      "[159]\tvalid_0's multi_logloss: 0.822215\n",
      "[160]\tvalid_0's multi_logloss: 0.821691\n",
      "[161]\tvalid_0's multi_logloss: 0.821294\n",
      "[162]\tvalid_0's multi_logloss: 0.820912\n",
      "[163]\tvalid_0's multi_logloss: 0.820588\n",
      "[164]\tvalid_0's multi_logloss: 0.82034\n",
      "[165]\tvalid_0's multi_logloss: 0.819913\n",
      "[166]\tvalid_0's multi_logloss: 0.819585\n",
      "[167]\tvalid_0's multi_logloss: 0.819288\n",
      "[168]\tvalid_0's multi_logloss: 0.81888\n",
      "[169]\tvalid_0's multi_logloss: 0.818457\n",
      "[170]\tvalid_0's multi_logloss: 0.817988\n",
      "[171]\tvalid_0's multi_logloss: 0.817538\n",
      "[172]\tvalid_0's multi_logloss: 0.817302\n",
      "[173]\tvalid_0's multi_logloss: 0.816973\n",
      "[174]\tvalid_0's multi_logloss: 0.816817\n",
      "[175]\tvalid_0's multi_logloss: 0.816554\n",
      "[176]\tvalid_0's multi_logloss: 0.816251\n",
      "[177]\tvalid_0's multi_logloss: 0.816071\n",
      "[178]\tvalid_0's multi_logloss: 0.815716\n",
      "[179]\tvalid_0's multi_logloss: 0.815511\n",
      "[180]\tvalid_0's multi_logloss: 0.81518\n",
      "[181]\tvalid_0's multi_logloss: 0.814858\n",
      "[182]\tvalid_0's multi_logloss: 0.814714\n",
      "[183]\tvalid_0's multi_logloss: 0.814467\n",
      "[184]\tvalid_0's multi_logloss: 0.814186\n",
      "[185]\tvalid_0's multi_logloss: 0.813963\n",
      "[186]\tvalid_0's multi_logloss: 0.813695\n",
      "[187]\tvalid_0's multi_logloss: 0.813304\n",
      "[188]\tvalid_0's multi_logloss: 0.812959\n",
      "[189]\tvalid_0's multi_logloss: 0.812666\n",
      "[190]\tvalid_0's multi_logloss: 0.812427\n",
      "[191]\tvalid_0's multi_logloss: 0.812119\n",
      "[192]\tvalid_0's multi_logloss: 0.811845\n",
      "[193]\tvalid_0's multi_logloss: 0.811514\n",
      "[194]\tvalid_0's multi_logloss: 0.811197\n",
      "[195]\tvalid_0's multi_logloss: 0.810974\n",
      "[196]\tvalid_0's multi_logloss: 0.8107\n",
      "[197]\tvalid_0's multi_logloss: 0.810512\n",
      "[198]\tvalid_0's multi_logloss: 0.810369\n",
      "[199]\tvalid_0's multi_logloss: 0.810194\n",
      "[200]\tvalid_0's multi_logloss: 0.810032\n",
      "[201]\tvalid_0's multi_logloss: 0.80971\n",
      "[202]\tvalid_0's multi_logloss: 0.809449\n",
      "[203]\tvalid_0's multi_logloss: 0.809216\n",
      "[204]\tvalid_0's multi_logloss: 0.809016\n",
      "[205]\tvalid_0's multi_logloss: 0.808814\n",
      "[206]\tvalid_0's multi_logloss: 0.80847\n",
      "[207]\tvalid_0's multi_logloss: 0.808322\n",
      "[208]\tvalid_0's multi_logloss: 0.808049\n",
      "[209]\tvalid_0's multi_logloss: 0.807907\n",
      "[210]\tvalid_0's multi_logloss: 0.807743\n",
      "[211]\tvalid_0's multi_logloss: 0.807475\n",
      "[212]\tvalid_0's multi_logloss: 0.807338\n",
      "[213]\tvalid_0's multi_logloss: 0.80716\n",
      "[214]\tvalid_0's multi_logloss: 0.806933\n",
      "[215]\tvalid_0's multi_logloss: 0.806738\n",
      "[216]\tvalid_0's multi_logloss: 0.806637\n",
      "[217]\tvalid_0's multi_logloss: 0.806472\n",
      "[218]\tvalid_0's multi_logloss: 0.806198\n",
      "[219]\tvalid_0's multi_logloss: 0.806096\n",
      "[220]\tvalid_0's multi_logloss: 0.805937\n",
      "[221]\tvalid_0's multi_logloss: 0.805769\n",
      "[222]\tvalid_0's multi_logloss: 0.805594\n",
      "[223]\tvalid_0's multi_logloss: 0.805425\n",
      "[224]\tvalid_0's multi_logloss: 0.805213\n",
      "[225]\tvalid_0's multi_logloss: 0.80518\n",
      "[226]\tvalid_0's multi_logloss: 0.804988\n",
      "[227]\tvalid_0's multi_logloss: 0.804827\n",
      "[228]\tvalid_0's multi_logloss: 0.804696\n",
      "[229]\tvalid_0's multi_logloss: 0.804536\n",
      "[230]\tvalid_0's multi_logloss: 0.80437\n",
      "[231]\tvalid_0's multi_logloss: 0.80417\n",
      "[232]\tvalid_0's multi_logloss: 0.804016\n",
      "[233]\tvalid_0's multi_logloss: 0.803855\n",
      "[234]\tvalid_0's multi_logloss: 0.803705\n",
      "[235]\tvalid_0's multi_logloss: 0.803511\n",
      "[236]\tvalid_0's multi_logloss: 0.803344\n",
      "[237]\tvalid_0's multi_logloss: 0.803194\n",
      "[238]\tvalid_0's multi_logloss: 0.803087\n",
      "[239]\tvalid_0's multi_logloss: 0.802983\n",
      "[240]\tvalid_0's multi_logloss: 0.802882\n",
      "[241]\tvalid_0's multi_logloss: 0.80279\n",
      "[242]\tvalid_0's multi_logloss: 0.802651\n",
      "[243]\tvalid_0's multi_logloss: 0.802477\n",
      "[244]\tvalid_0's multi_logloss: 0.802323\n",
      "[245]\tvalid_0's multi_logloss: 0.802133\n",
      "[246]\tvalid_0's multi_logloss: 0.802047\n",
      "[247]\tvalid_0's multi_logloss: 0.801921\n",
      "[248]\tvalid_0's multi_logloss: 0.801812\n",
      "[249]\tvalid_0's multi_logloss: 0.801706\n",
      "[250]\tvalid_0's multi_logloss: 0.801473\n",
      "[251]\tvalid_0's multi_logloss: 0.801338\n",
      "[252]\tvalid_0's multi_logloss: 0.801259\n",
      "[253]\tvalid_0's multi_logloss: 0.801234\n",
      "[254]\tvalid_0's multi_logloss: 0.801117\n",
      "[255]\tvalid_0's multi_logloss: 0.800988\n",
      "[256]\tvalid_0's multi_logloss: 0.80082\n",
      "[257]\tvalid_0's multi_logloss: 0.800748\n",
      "[258]\tvalid_0's multi_logloss: 0.800657\n",
      "[259]\tvalid_0's multi_logloss: 0.800477\n",
      "[260]\tvalid_0's multi_logloss: 0.800418\n",
      "[261]\tvalid_0's multi_logloss: 0.800379\n",
      "[262]\tvalid_0's multi_logloss: 0.800331\n",
      "[263]\tvalid_0's multi_logloss: 0.800302\n",
      "[264]\tvalid_0's multi_logloss: 0.800105\n",
      "[265]\tvalid_0's multi_logloss: 0.800071\n",
      "[266]\tvalid_0's multi_logloss: 0.799973\n",
      "[267]\tvalid_0's multi_logloss: 0.799958\n",
      "[268]\tvalid_0's multi_logloss: 0.799867\n",
      "[269]\tvalid_0's multi_logloss: 0.79985\n",
      "[270]\tvalid_0's multi_logloss: 0.799681\n",
      "[271]\tvalid_0's multi_logloss: 0.799615\n",
      "[272]\tvalid_0's multi_logloss: 0.799428\n",
      "[273]\tvalid_0's multi_logloss: 0.799362\n",
      "[274]\tvalid_0's multi_logloss: 0.799374\n",
      "[275]\tvalid_0's multi_logloss: 0.799144\n",
      "[276]\tvalid_0's multi_logloss: 0.799081\n",
      "[277]\tvalid_0's multi_logloss: 0.79899\n",
      "[278]\tvalid_0's multi_logloss: 0.799034\n",
      "[279]\tvalid_0's multi_logloss: 0.799006\n",
      "[280]\tvalid_0's multi_logloss: 0.798992\n",
      "[281]\tvalid_0's multi_logloss: 0.798904\n",
      "[282]\tvalid_0's multi_logloss: 0.798881\n",
      "[283]\tvalid_0's multi_logloss: 0.798841\n",
      "[284]\tvalid_0's multi_logloss: 0.798705\n",
      "[285]\tvalid_0's multi_logloss: 0.7987\n",
      "[286]\tvalid_0's multi_logloss: 0.798579\n",
      "[287]\tvalid_0's multi_logloss: 0.79849\n",
      "[288]\tvalid_0's multi_logloss: 0.79845\n",
      "[289]\tvalid_0's multi_logloss: 0.798342\n",
      "[290]\tvalid_0's multi_logloss: 0.798237\n",
      "[291]\tvalid_0's multi_logloss: 0.798153\n",
      "[292]\tvalid_0's multi_logloss: 0.79825\n",
      "[293]\tvalid_0's multi_logloss: 0.798238\n",
      "[294]\tvalid_0's multi_logloss: 0.79815\n",
      "[295]\tvalid_0's multi_logloss: 0.798117\n",
      "[296]\tvalid_0's multi_logloss: 0.798141\n",
      "[297]\tvalid_0's multi_logloss: 0.798073\n",
      "[298]\tvalid_0's multi_logloss: 0.798022\n",
      "[299]\tvalid_0's multi_logloss: 0.798016\n",
      "[300]\tvalid_0's multi_logloss: 0.797889\n",
      "[301]\tvalid_0's multi_logloss: 0.797863\n",
      "[302]\tvalid_0's multi_logloss: 0.797774\n",
      "[303]\tvalid_0's multi_logloss: 0.797766\n",
      "[304]\tvalid_0's multi_logloss: 0.797709\n",
      "[305]\tvalid_0's multi_logloss: 0.797653\n",
      "[306]\tvalid_0's multi_logloss: 0.797624\n",
      "[307]\tvalid_0's multi_logloss: 0.797611\n",
      "[308]\tvalid_0's multi_logloss: 0.797692\n",
      "[309]\tvalid_0's multi_logloss: 0.797604\n",
      "[310]\tvalid_0's multi_logloss: 0.797492\n",
      "[311]\tvalid_0's multi_logloss: 0.797442\n",
      "[312]\tvalid_0's multi_logloss: 0.797496\n",
      "[313]\tvalid_0's multi_logloss: 0.797504\n",
      "[314]\tvalid_0's multi_logloss: 0.797482\n",
      "[315]\tvalid_0's multi_logloss: 0.797418\n",
      "[316]\tvalid_0's multi_logloss: 0.797412\n",
      "[317]\tvalid_0's multi_logloss: 0.797495\n",
      "[318]\tvalid_0's multi_logloss: 0.797392\n",
      "[319]\tvalid_0's multi_logloss: 0.797488\n",
      "[320]\tvalid_0's multi_logloss: 0.797407\n",
      "[321]\tvalid_0's multi_logloss: 0.797388\n",
      "[322]\tvalid_0's multi_logloss: 0.797379\n",
      "[323]\tvalid_0's multi_logloss: 0.797332\n",
      "[324]\tvalid_0's multi_logloss: 0.797329\n",
      "[325]\tvalid_0's multi_logloss: 0.797348\n",
      "[326]\tvalid_0's multi_logloss: 0.797353\n",
      "[327]\tvalid_0's multi_logloss: 0.797299\n",
      "[328]\tvalid_0's multi_logloss: 0.79719\n",
      "[329]\tvalid_0's multi_logloss: 0.797125\n",
      "[330]\tvalid_0's multi_logloss: 0.797087\n",
      "[331]\tvalid_0's multi_logloss: 0.797055\n",
      "[332]\tvalid_0's multi_logloss: 0.797042\n",
      "[333]\tvalid_0's multi_logloss: 0.797036\n",
      "[334]\tvalid_0's multi_logloss: 0.797005\n",
      "[335]\tvalid_0's multi_logloss: 0.796994\n",
      "[336]\tvalid_0's multi_logloss: 0.796952\n",
      "[337]\tvalid_0's multi_logloss: 0.796904\n",
      "[338]\tvalid_0's multi_logloss: 0.79686\n",
      "[339]\tvalid_0's multi_logloss: 0.796811\n",
      "[340]\tvalid_0's multi_logloss: 0.796856\n",
      "[341]\tvalid_0's multi_logloss: 0.796943\n",
      "[342]\tvalid_0's multi_logloss: 0.79693\n",
      "[343]\tvalid_0's multi_logloss: 0.796992\n",
      "[344]\tvalid_0's multi_logloss: 0.796999\n",
      "[345]\tvalid_0's multi_logloss: 0.797069\n",
      "[346]\tvalid_0's multi_logloss: 0.797075\n",
      "[347]\tvalid_0's multi_logloss: 0.79711\n",
      "[348]\tvalid_0's multi_logloss: 0.797083\n",
      "[349]\tvalid_0's multi_logloss: 0.797108\n",
      "[350]\tvalid_0's multi_logloss: 0.797092\n",
      "[351]\tvalid_0's multi_logloss: 0.7971\n",
      "[352]\tvalid_0's multi_logloss: 0.797063\n",
      "[353]\tvalid_0's multi_logloss: 0.797163\n",
      "[354]\tvalid_0's multi_logloss: 0.797146\n",
      "[355]\tvalid_0's multi_logloss: 0.797089\n",
      "[356]\tvalid_0's multi_logloss: 0.797117\n",
      "[357]\tvalid_0's multi_logloss: 0.797055\n",
      "[358]\tvalid_0's multi_logloss: 0.797013\n",
      "[359]\tvalid_0's multi_logloss: 0.796967\n",
      "[360]\tvalid_0's multi_logloss: 0.79701\n",
      "[361]\tvalid_0's multi_logloss: 0.796961\n",
      "[362]\tvalid_0's multi_logloss: 0.796982\n",
      "[363]\tvalid_0's multi_logloss: 0.796971\n",
      "[364]\tvalid_0's multi_logloss: 0.797014\n",
      "[365]\tvalid_0's multi_logloss: 0.797084\n",
      "[366]\tvalid_0's multi_logloss: 0.797068\n",
      "[367]\tvalid_0's multi_logloss: 0.79712\n",
      "[368]\tvalid_0's multi_logloss: 0.797112\n",
      "[369]\tvalid_0's multi_logloss: 0.797047\n",
      "[370]\tvalid_0's multi_logloss: 0.797011\n",
      "[371]\tvalid_0's multi_logloss: 0.796952\n",
      "[372]\tvalid_0's multi_logloss: 0.797008\n",
      "[373]\tvalid_0's multi_logloss: 0.797027\n",
      "[374]\tvalid_0's multi_logloss: 0.797072\n",
      "[375]\tvalid_0's multi_logloss: 0.797011\n",
      "[376]\tvalid_0's multi_logloss: 0.797037\n",
      "[377]\tvalid_0's multi_logloss: 0.797093\n",
      "[378]\tvalid_0's multi_logloss: 0.797123\n",
      "[379]\tvalid_0's multi_logloss: 0.797085\n",
      "[380]\tvalid_0's multi_logloss: 0.797145\n",
      "[381]\tvalid_0's multi_logloss: 0.797142\n",
      "[382]\tvalid_0's multi_logloss: 0.797168\n",
      "[383]\tvalid_0's multi_logloss: 0.797048\n",
      "[384]\tvalid_0's multi_logloss: 0.797051\n",
      "[385]\tvalid_0's multi_logloss: 0.797072\n",
      "[386]\tvalid_0's multi_logloss: 0.797135\n",
      "[387]\tvalid_0's multi_logloss: 0.797126\n",
      "[388]\tvalid_0's multi_logloss: 0.797125\n",
      "[389]\tvalid_0's multi_logloss: 0.797173\n",
      "[390]\tvalid_0's multi_logloss: 0.797223\n",
      "[391]\tvalid_0's multi_logloss: 0.797254\n",
      "[392]\tvalid_0's multi_logloss: 0.797282\n",
      "[393]\tvalid_0's multi_logloss: 0.79725\n",
      "[394]\tvalid_0's multi_logloss: 0.797263\n",
      "[395]\tvalid_0's multi_logloss: 0.797274\n",
      "[396]\tvalid_0's multi_logloss: 0.797312\n",
      "[397]\tvalid_0's multi_logloss: 0.797346\n",
      "[398]\tvalid_0's multi_logloss: 0.797392\n",
      "[399]\tvalid_0's multi_logloss: 0.797442\n",
      "[400]\tvalid_0's multi_logloss: 0.79754\n",
      "[401]\tvalid_0's multi_logloss: 0.797592\n",
      "[402]\tvalid_0's multi_logloss: 0.797609\n",
      "[403]\tvalid_0's multi_logloss: 0.797539\n",
      "[404]\tvalid_0's multi_logloss: 0.797452\n",
      "[405]\tvalid_0's multi_logloss: 0.797463\n",
      "[406]\tvalid_0's multi_logloss: 0.797454\n",
      "[407]\tvalid_0's multi_logloss: 0.797486\n",
      "[408]\tvalid_0's multi_logloss: 0.797571\n",
      "[409]\tvalid_0's multi_logloss: 0.797647\n",
      "[410]\tvalid_0's multi_logloss: 0.797671\n",
      "[411]\tvalid_0's multi_logloss: 0.797748\n",
      "[412]\tvalid_0's multi_logloss: 0.797799\n",
      "[413]\tvalid_0's multi_logloss: 0.797884\n",
      "[414]\tvalid_0's multi_logloss: 0.797991\n",
      "[415]\tvalid_0's multi_logloss: 0.798006\n",
      "[416]\tvalid_0's multi_logloss: 0.798063\n",
      "[417]\tvalid_0's multi_logloss: 0.798101\n",
      "[418]\tvalid_0's multi_logloss: 0.798099\n",
      "[419]\tvalid_0's multi_logloss: 0.798064\n",
      "[420]\tvalid_0's multi_logloss: 0.798107\n",
      "[421]\tvalid_0's multi_logloss: 0.798172\n",
      "[422]\tvalid_0's multi_logloss: 0.798225\n",
      "[423]\tvalid_0's multi_logloss: 0.798284\n",
      "[424]\tvalid_0's multi_logloss: 0.798324\n",
      "[425]\tvalid_0's multi_logloss: 0.798303\n",
      "[426]\tvalid_0's multi_logloss: 0.798361\n",
      "[427]\tvalid_0's multi_logloss: 0.798481\n",
      "[428]\tvalid_0's multi_logloss: 0.798553\n",
      "[429]\tvalid_0's multi_logloss: 0.798604\n",
      "[430]\tvalid_0's multi_logloss: 0.798684\n",
      "[431]\tvalid_0's multi_logloss: 0.798808\n",
      "[432]\tvalid_0's multi_logloss: 0.798834\n",
      "[433]\tvalid_0's multi_logloss: 0.798854\n",
      "[434]\tvalid_0's multi_logloss: 0.798894\n",
      "[435]\tvalid_0's multi_logloss: 0.798891\n",
      "[436]\tvalid_0's multi_logloss: 0.798946\n",
      "[437]\tvalid_0's multi_logloss: 0.798953\n",
      "[438]\tvalid_0's multi_logloss: 0.799038\n",
      "[439]\tvalid_0's multi_logloss: 0.799105\n",
      "Early stopping, best iteration is:\n",
      "[339]\tvalid_0's multi_logloss: 0.796811\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bylevel=0.5,\n",
       "        colsample_bytree=0.5, importance_type='split', learning_rate=0.1,\n",
       "        max_depth=-1, metric='multi_logloss', min_child_samples=20,\n",
       "        min_child_weight=1.5, min_split_gain=0.0, n_estimators=2000,\n",
       "        n_jobs=16, num_class=19, num_leaves=32, objective='multiclass',\n",
       "        random_state=2019, reg_alpha=0.0, reg_lambda=10, silent=True,\n",
       "        subsample=0.7, subsample_for_bin=200000, subsample_freq=0)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model =lgb.LGBMClassifier(boosting_type='gbdt', \n",
    "                   num_leaves=2**5,\n",
    "                   max_depth=-1, \n",
    "                   learning_rate= 0.1,\n",
    "                   n_estimators=2000,\n",
    "                   objective='multiclass',\n",
    "                   subsample=0.7,#\n",
    "                   colsample_bytree=0.5,#\n",
    "                   reg_lambda=10,#l2\n",
    "                   n_jobs=16, #\n",
    "                   num_class=19,#\n",
    "                   silent=True, \n",
    "                   random_state=2019,\n",
    "#                    class_weight=20,\n",
    "                   colsample_bylevel=0.5,\n",
    "                   min_child_weight=1.5,\n",
    "                   metric='multi_logloss'\n",
    "                  )\n",
    "model.fit(X_train,y_train,eval_set=(X_eval,y_eval), early_stopping_rounds=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 线下验证结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "#[1,2,3,2,1,3]\n",
    "#[1,2,3,1,1,3]\n",
    "def cal_macro_f1(y_true,y_pred):\n",
    "    score = f1_score(y_true,y_pred,average='macro')\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "validation score is 0.7539065473958163\n"
     ]
    }
   ],
   "source": [
    "eval_prob = model.predict_proba(X_eval) #[10,0227*19]\n",
    "eval_pred = np.argmax(eval_prob,axis=1)#[]\n",
    "eval_pred = lb.inverse_transform(eval_pred)[]\n",
    "score = cal_macro_f1(lb.inverse_transform(y_eval),eval_pred)\n",
    "print(\"validation score is\",score)\n",
    "#0.7539065473958163"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_prob = model.predict_proba(test_term_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 结果提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = np.argmax(test_prob,axis=1)\n",
    "test['class'] = lb.inverse_transform(test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[[\"id\",\"class\"]].to_csv(\"submission_baseline_lgb.csv\",index=False,header=True,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}